Notes:

  1. All whitespaces changed to underscores.
  2. The file вид_ТГСК.xlsx was renamed to gvhd_type.xlsx.

Data reading

# Names of the Excel workbooks in the derivable-datasets folder.
# The pattern is anchored to the file extension, with "." as a literal dot
# (the previous ".xls" treated "." as "any character" and matched anywhere
# in the name).
files_dd <- list.files(path = "./raw_data/3_Derivable_Datasets/",
                       pattern = "\\.xls[xm]?$",
                       all.files = FALSE, full.names = FALSE)

# Read every workbook into its own data frame.
# lapply() always returns a list (sapply() may simplify equally shaped results
# to a matrix), and a plain assignment replaces do.call("<-", ...), which
# appears to have also assigned the result to a variable named after the
# first file.
fdd <- lapply(files_dd,
              function(x) read.xlsx(paste0("./raw_data/3_Derivable_Datasets/", x),
                                    na.strings = NA))

# Name each list element after its file: the part before the first dot
names(fdd) <- str_split_i(files_dd, "\\.", 1)

## The files in the second folder share a common "disease": an explanatory row
## right below the column names.
## The author found no trivial way to handle it in code, so that row was
## simply removed in the Excel files themselves.

# Names of the Excel workbooks in the final-export folder.
# The pattern is anchored to the file extension, with "." as a literal dot.
files_fet <- list.files(path = "./raw_data/Final_export_tables/",
                        pattern = "\\.xls[xm]?$",
                        all.files = FALSE, full.names = FALSE)

# Read every workbook into its own data frame.
# lapply() always returns a list (sapply() may simplify equally shaped results
# to a matrix), and a plain assignment replaces do.call("<-", ...), which
# appears to have also assigned the result to a variable named after the
# first file.
fnet <- lapply(files_fet,
               function(x) read.xlsx(paste0("./raw_data/Final_export_tables/", x),
                                     na.strings = NA, startRow = 1))

# Name each list element after its file: the part before the first dot
names(fnet) <- str_split_i(files_fet, "\\.", 1)

Dataframe building

Начнем «плясать от печки»: будем собирать датасет, начиная с файла с демографическими данными. Затем добавим все сырые файлы, в которых единицей наблюдения является пациент (это датафреймы с данными о показаниях к пересадке, профилактике, лечении и выживаемости).

# Build the per-patient table: start from demographics (DM) and left-join the
# TU, PREV, TR and STAT tables on SUBJID. SITE is dropped from every joined
# table so that it comes only from DM; PTSTAT from STAT is renamed to ALIVE.
# If you did NOT remove the second (explanatory) row from the Excel files,
# append `%>% slice(-1)` to each of the five source tables below.
common_df <- fnet$DM_20230119_120304 %>%
  left_join(select(fnet$TU_20230119_120304, !SITE), by = "SUBJID") %>%
  left_join(select(fnet$PREV_20230119_120304, !SITE), by = "SUBJID") %>%
  left_join(select(fnet$TR_20230119_120304, !SITE), by = "SUBJID") %>%
  left_join(
    rename(select(fnet$STAT_20230119_120304, !SITE), ALIVE = PTSTAT),
    by = "SUBJID"
  )
## Add the datasets describing which forms of GVHD are present.
## They are first simplified and reshaped to wide format.

# Acute GVHD: one `acute_<location>` column per AGVHDLOC value, one row per
# patient. The prefix tells the affected organ's origin apart (both the acute
# and the chronic dataset have a Liver variable).
agvhd <- fnet$AGVHD_20230119_120304 %>% 
  pivot_wider(names_from = AGVHDLOC,
              values_from = AGVHDST,
              names_prefix = "acute_") %>%
  group_by(SUBJID) %>% 
  # Collapse each patient's rows to the per-column maximum. Columns that are
  # entirely NA for a patient stay NA (of the column's own type) instead of
  # triggering max()'s "no non-missing arguments" warning.
  mutate(across(everything(), function(x) {
    if (all(is.na(x))) x[NA_integer_] else max(x, na.rm = TRUE)
  })) %>% 
  ungroup() %>% 
  distinct(SUBJID, .keep_all = TRUE) %>% 
  # Drop the column produced by rows with a missing AGVHDLOC
  select(!acute_NA)
## Warning: There were 240 warnings in `mutate()`.
## The first warning was:
## ℹ In argument: `across(everything(), function(x) max(x, na.rm = TRUE))`.
## ℹ In group 1: `SUBJID = "01001"`.
## Caused by warning in `max()`:
## ! no non-missing arguments, returning NA
## ℹ Run `dplyr::last_dplyr_warnings()` to see the 239 remaining warnings.
# Chronic GVHD: one `chronic_<location>` column per CGVHDLOC value, one row per
# patient. The prefix tells the affected organ's origin apart (both the acute
# and the chronic dataset have a Liver variable).
cgvhd <- fnet$CGVHD_20230119_120304 %>% 
  pivot_wider(names_from = CGVHDLOC,
              values_from = DAMDEG,
              names_prefix = "chronic_") %>%
  group_by(SUBJID) %>% 
  # Collapse each patient's rows to the per-column maximum. Columns that are
  # entirely NA for a patient stay NA (of the column's own type): plain
  # max(x, na.rm = TRUE) warns and returns -Inf for numeric columns here.
  mutate(across(everything(), function(x) {
    if (all(is.na(x))) x[NA_integer_] else max(x, na.rm = TRUE)
  })) %>% 
  ungroup() %>% 
  distinct(SUBJID, .keep_all = TRUE) %>% 
  # Drop the column produced by rows with a missing CGVHDLOC, and TYPEOTH
  select(-c(chronic_NA, TYPEOTH))
## Warning: There were 394 warnings in `mutate()`.
## The first warning was:
## ℹ In argument: `across(everything(), function(x) max(x, na.rm = TRUE))`.
## ℹ In group 1: `SUBJID = "01003"`.
## Caused by warning in `max()`:
## ! no non-missing arguments to max; returning -Inf
## ℹ Run `dplyr::last_dplyr_warnings()` to see the 393 remaining warnings.
# GVHD facts in wide format: one column per GVHDCAT value holding the GVHDYN
# flag. Rows are kept when a GVHD date is present or cross syndrome is "yes".
fgvhd <- fnet$GVHD_20230119_120304 %>%
  pivot_wider(values_from = GVHDYN, names_from = GVHDCAT) %>%
  select(-GVHDOTHM) %>%    # per the original author's note, this column is all NA
  filter(`Cross syndrome` == "yes" | !is.na(GVHDDTC))

Преобразуем типы переменных

# Convert column types of the per-patient table:
#   * categorical columns -> factor
#   * date columns        -> re-formatted from "dd/mm/yyyy" to "dd.mm.yyyy"
#                            (they remain character strings)
#   * dose / count columns -> numeric
# Several disease-classification columns are then dropped; PTN and PSOCN stay.
common_df_patient <- common_df %>%
  mutate(
    across(
      c(SITE, SEX, ICD, TUTERM, LLTC, LLTN, PTC,
        PTN, PSOCC, PSOCN, TRTYPE, TRSOURCE,
        CONDTYPE, ALIVE, RELAPYN),
      as.factor
    ),
    across(
      c(BIRTHDTC, PRSTDTC, PRENDTC, TRDTC,
        LCDTC, DEATHDTC, RELAPDTC),
      function(d) format(as.Date(d, format = "%d/%m/%Y"), "%d.%m.%Y")
    ),
    across(c(TIMGDOSE, HATGDOSE, TRNUM), as.numeric)
  ) %>%
  select(-c(LLTC, PTC, PSOCC, ICD, TUTERM, LLTN))

Меняем ошибочную дату

# Overwrite the prophylaxis start date of subject 01095 with the corrected
# value (dates are stored as "dd.mm.yyyy" strings at this point).
common_df_patient <- mutate(
  common_df_patient,
  PRSTDTC = if_else(
    condition = SUBJID == "01095",
    true = "08.07.2021",
    false = PRSTDTC
  )
)

Для текущего датафрейма единица наблюдения – 1 пациент

Добавим информацию про острую, хроническую и оверлап РТПХ

## Merge the GVHD facts into the per-patient data (SITE is taken from the
## patient table only).

common_df_disease <- common_df_patient %>%
  left_join(select(fgvhd, !SITE), by = "SUBJID")

## Split into three datasets: acute GVHD, chronic GVHD and cross (overlap)
## syndrome. The acute and chronic subsets get their organ-level columns.

common_df_disease_acute <- common_df_disease %>%
  filter(`Acute GVHD` == "yes") %>%
  left_join(select(agvhd, !SITE), by = "SUBJID")

common_df_disease_chronic <- common_df_disease %>%
  filter(`Chronic GVHD` == "yes") %>%
  left_join(select(cgvhd, !SITE), by = "SUBJID")

common_df_disease_cross <- filter(common_df_disease, `Cross syndrome` == "yes")

# Recombine the three subsets, matching on every column of the pre-split
# table (the right-hand side is evaluated before the name is reassigned).
common_df_disease <- common_df_disease_acute %>%
  full_join(common_df_disease_chronic, by = colnames(common_df_disease)) %>%
  full_join(common_df_disease_cross, by = colnames(common_df_disease))

Преобразуем типы переменных.

## Derive the therapy indication (TRIND), convert column types, fold the
## free-text "other" values into their main columns and shorten column names.
common_df_disease <- common_df_disease %>%
  mutate(
    # TRIND is needed later as a join key for the therapy table.
    # The first matching condition wins (acute, then chronic, then cross).
    TRIND = as.factor(case_when(
      `Acute GVHD` == "yes" ~ "acute GVHD",
      `Chronic GVHD` == "yes" ~ "chronic GVHD",
      `Cross syndrome` == "yes" ~ "cross syndrome"
    ))
  ) %>%
  mutate(
    across(
      c(AGVHDOCC, AGVHDGR, acute_Skin, acute_Liver,
        `acute_Upper gastrointestinal tract`,
        `acute_Lower gastrointestinal tract`,
        CGVHDOCC, PTSTAT, SEVTYPE, SEVGRADE,
        `chronic_Skin involvement, % lesion of body surface area`,
        `chronic_Sclerotic changes of the skin`,
        `chronic_Changes in oral cavity`,
        `chronic_Eyes`,
        `chronic_Gastrointestinal tract`, chronic_Liver, chronic_Lungs,
        `chronic_Lungs functional assessment`,
        `chronic_Joints and fascia`,
        `chronic_Sex organs`, GVHDMETH,
        `Acute GVHD`, `Chronic GVHD`, `Cross syndrome`),
      as.factor
    ),
    GVHDAGE = as.numeric(GVHDAGE),
    GVHDDTC = format(as.Date(GVHDDTC, format = "%d/%m/%Y"), "%d.%m.%Y"),
    # Move the "other" conditioning regimen into the main column
    # (CONDOTH itself is dropped at the end of the pipeline)
    COND = as.factor(case_when(
      COND == "other" ~ CONDOTH,
      COND != "other" ~ COND
    )),
    # Move the "other" prophylaxis scheme into the main column
    # (SCHEMOTH itself is dropped at the end of the pipeline)
    PRSCHEM = case_when(
      PRSCHEM == "other" ~ SCHEMOTH,
      PRSCHEM != "other" ~ PRSCHEM
    ),
    # Recode the occurrence flags: missing -> "no", "yes" stays "yes".
    # This runs after the factor conversion above, so both columns end up as
    # character vectors.
    # NOTE(review): any value other than "yes"/NA becomes NA here - confirm
    # that the raw data never contains such values.
    across(c(AGVHDOCC, CGVHDOCC), function(occ) case_when(
      is.na(occ) ~ "no",
      occ == "yes" ~ "yes"
    ))
  ) %>%
  # Shorten / sanitise a number of column names
  rename(
    Acute_GVHD = `Acute GVHD`,
    Chronic_GVHD = `Chronic GVHD`,
    Cross_syndrome = `Cross syndrome`,
    acute_UGT = `acute_Upper gastrointestinal tract`,
    acute_LGT = `acute_Lower gastrointestinal tract`,
    chronic_Skin_perc = `chronic_Skin involvement, % lesion of body surface area`,
    chronic_Skin_scl = `chronic_Sclerotic changes of the skin`,
    chronic_oral = `chronic_Changes in oral cavity`,
    chronic_GT = `chronic_Gastrointestinal tract`,
    chronic_Lungs_func = `chronic_Lungs functional assessment`,
    chronic_Joints = `chronic_Joints and fascia`,
    chronic_Sex = `chronic_Sex organs`
  ) %>%
  select(-c(CONDOTH, Acute_GVHD, Chronic_GVHD,
            Cross_syndrome, SCHEMOTH))

Для нашего текущего датафрейма единица наблюдения – наличие одного из типов РТПХ у одного пациента.

Вновь разобьем датасет на три субдатасета по заболеваниям

# Per-indication subsets of the disease-level table, selected by TRIND
common_df_disease_acute <- filter(common_df_disease, TRIND == "acute GVHD")

common_df_disease_chronic <- filter(common_df_disease, TRIND == "chronic GVHD")

common_df_disease_cross <- filter(common_df_disease, TRIND == "cross syndrome")

Добавим информацию о полученной терапии

# Attach the therapy records (CM table) to the disease-level table, matching
# on patient and therapy indication, then clean up the therapy columns.
common_df_treatment <- left_join(common_df_disease,
          fnet$CM_20230119_120304 %>% select(!SITE),
          by = c("SUBJID", "TRIND")) %>% 
  select(-c(REPDRUG, DRUGC, ATCC, INGR)) %>%     # drop duplicated / secondary columns
  mutate(
    # Re-format therapy dates from "dd/mm/yyyy" to "dd.mm.yyyy" strings
    across(c(TRSTDTC, TRENDTC), ~ format(as.Date(.x, format = "%d/%m/%Y"), "%d.%m.%Y")),
    across(c(GVHDTRYN, DRUGN, ATCN, LOT, TRONG, TRRESP, RESPEV), ~as.factor(.x)),
    # New column: steroid resistance.
    #   * not a steroid row                               -> "no data"
    #   * steroid, response "no response" / "progression" -> "resistance"
    #   * steroid, response missing / not assessable      -> "no data"
    #   * steroid, any other response                     -> "no resistance"
    # The previous condition `TRRESP != "no data" | TRRESP != "not estimated"`
    # could never be FALSE, so steroid rows with a non-assessable response
    # were labelled "no resistance".
    STERRES = case_when(
      !(ATCN %in% c("GLUCOCORTICOIDS",
                    "CORTICOSTEROIDS, POTENT (GROUP III)",
                    "CORTICOSTEROIDS FOR SYSTEMIC USE")) ~ "no data",
      TRRESP %in% c("no response", "progression") ~ "resistance",
      is.na(TRRESP) | TRRESP %in% c("no data", "not estimated") ~ "no data",
      TRUE ~ "no resistance"
    ) %>% as.factor(),
    # Recode the yes/no flags: missing -> "no", "yes" stays "yes".
    # This runs after the factor conversion above, so both columns end up as
    # character vectors.
    across(c(GVHDTRYN,TRONG), ~ case_when(
      is.na(.x) ~ "no",
      .x == "yes" ~ "yes"
    ))
  )

Добавим новые переменные: длительность профилактики, длительность лечения, интервал от пересадки КМ до установления диагноза РТПХ, интервал от пересадки КМ до начала профилактики, интервал от конца профилактики до начала лечения.

#View(common_df_treatment_chronic)
# Add interval variables, in days, computed from the "dd.mm.yyyy" date strings.
common_df_treatment <- common_df_treatment %>%
  mutate(
    # Transplantation -> GVHD diagnosis
    TDINTER = as.numeric(
      as.Date(GVHDDTC, format = "%d.%m.%Y") - as.Date(TRDTC, format = "%d.%m.%Y")
    ),
    # Transplantation -> start of prophylaxis
    TPINTER = as.numeric(
      as.Date(PRSTDTC, format = "%d.%m.%Y") - as.Date(TRDTC, format = "%d.%m.%Y")
    ),
    # End of prophylaxis vs. start of treatment.
    # NOTE(review): this is computed as PRENDTC - TRSTDTC, so it is negative
    # when treatment starts after prophylaxis ends - confirm the intended sign.
    PTINTER = as.numeric(
      as.Date(PRENDTC, format = "%d.%m.%Y") - as.Date(TRSTDTC, format = "%d.%m.%Y")
    ),
    # Duration of prophylaxis
    PRINTER = as.numeric(
      as.Date(PRENDTC, format = "%d.%m.%Y") - as.Date(PRSTDTC, format = "%d.%m.%Y")
    ),
    # Duration of treatment; when the treatment end date is missing, LCDTC is
    # used as the end point instead
    TRINTER = if_else(
      is.na(TRENDTC),
      as.numeric(
        as.Date(LCDTC, format = "%d.%m.%Y") - as.Date(TRSTDTC, format = "%d.%m.%Y")
      ),
      as.numeric(
        as.Date(TRENDTC, format = "%d.%m.%Y") - as.Date(TRSTDTC, format = "%d.%m.%Y")
      )
    )
  )

# common_df_treatment_chronic <- common_df_treatment_chronic %>% 
#   filter(TPINTER>=3 & TPINTER < 5)   # Удаляем людей, у которых профилактика началась ранее чем на 3-й день после трансплантации, и позднее, чем на 5-й.
### Пробная штука

# Experimental table: per therapy record, 0/1 indicator columns for the drug
# class (by ATC name). The result stays grouped by SUBJID and TRIND.
test_drive <- common_df_treatment %>% 
  group_by(SUBJID, TRIND) %>% 
  mutate(
    # Highest line of therapy within the patient/indication group.
    # LOT is a factor, so its labels are converted to numbers first;
    # non-numeric labels become NA, and a group without any numeric label
    # gets NA. (The previous `max(is.numeric(LOT))` only tested the column
    # type and was always 0.) LOT is not among the columns kept by select()
    # below.
    LOT = {
      lot_num <- suppressWarnings(as.numeric(as.character(LOT)))
      if (all(is.na(lot_num))) NA_real_ else max(lot_num, na.rm = TRUE)
    },
    GLUK = if_else(ATCN %in% c("GLUCOCORTICOIDS", "CORTICOSTEROIDS, DERMATOLOGICAL PREPARATIONS",
                               "CORTICOSTEROIDS FOR SYSTEMIC USE", "CORTICOSTEROIDS, POTENT (GROUP III)"), 1, 0),
    JAK = if_else(ATCN %in% c("JANUS-ASSOCIATED KINASE (JAK) INHIBITORS"), 1, 0),
    CALIN = if_else(ATCN %in% c("CALCINEURIN INHIBITORS"), 1, 0),
    # Every remaining class; a missing ATCN also counts as "other" because NA
    # is listed in the set
    OTHER = if_else(ATCN %in% c("OTHER IMMUNOSUPPRESSANTS",
                                "SELECTIVE IMMUNOSUPPRESSANTS",
                                "INTERLEUKINS",
                                "BCR-ABL TYROSINE KINASE INHIBITORS",
                                "INTERLEUKIN INHIBITORS",
                                "OTHER IMMUNOSTIMULANTS",
                                "TUMOR NECROSIS FACTOR ALPHA (TNF-) INHIBITORS",
                                "IMIDAZOLE DERIVATIVES",
                                "BRUTON'S TYROSINE KINASE (BTK) INHIBITORS",
                                NA), 1, 0)
  ) %>% 
  # The grouping columns are listed explicitly instead of relying on select()
  # to re-add them with a message
  select(SUBJID, TRIND, ATCN, GLUK, JAK, CALIN, OTHER)
## Adding missing grouping variables: `SUBJID`, `TRIND`

Разобьем полученные данные на три блока по типу заболевания (опять)

# Per-indication subsets of the treatment-level table, selected by TRIND
common_df_treatment_acute <- filter(common_df_treatment, TRIND == "acute GVHD")

common_df_treatment_chronic <- filter(common_df_treatment, TRIND == "chronic GVHD")

common_df_treatment_cross <- filter(common_df_treatment, TRIND == "cross syndrome")

Additional transformations

Checking correctness of data

# Print the structure (column names, types, first values) of the
# treatment-level table as a sanity check
str(common_df_treatment)
## 'data.frame':    650 obs. of  61 variables:
##  $ SITE              : Factor w/ 4 levels "01","02","03",..: 1 1 1 1 1 1 1 1 1 1 ...
##  $ SUBJID            : chr  "01001" "01002" "01003" "01005" ...
##  $ BIRTHDTC          : chr  "11.01.1990" "05.05.1998" "30.09.1982" "28.09.1987" ...
##  $ SEX               : Factor w/ 2 levels "female","male": 1 2 1 1 2 1 2 2 2 1 ...
##  $ PTN               : Factor w/ 21 levels "ACUTE LYMPHOCYTIC LEUKAEMIA",..: 1 1 1 1 1 2 1 1 1 2 ...
##  $ PSOCN             : Factor w/ 2 levels "BLOOD AND LYMPHATIC SYSTEM DISORDERS",..: 2 2 2 2 2 2 2 2 2 2 ...
##  $ PRSTDTC           : chr  "07.09.2020" "21.04.2020" "08.05.2020" "20.02.2020" ...
##  $ PRSCHEM           : chr  "PT-Cy(+3,+5)+CSA+MMF45" "TCRaB-CD19" "hATG+CSA+MTX+MMF30" "PT-Cy (+3,+4)+CSA+MMF30" ...
##  $ TIMGDOSE          : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ HATGDOSE          : num  0 0 40 0 0 0 0 0 0 0 ...
##  $ PRENDTC           : chr  "01.06.2021" "20.05.2020" "17.05.2021" "25.10.2020" ...
##  $ TRNUM             : num  1 1 2 1 1 1 1 1 1 1 ...
##  $ TRTYPE            : Factor w/ 4 levels "Haplo","MMUD",..: 2 1 3 1 1 1 4 4 4 1 ...
##  $ TRDTC             : chr  "04.09.2020" "22.04.2020" "12.05.2020" "17.02.2020" ...
##  $ TRSOURCE          : Factor w/ 2 levels "BM","BSC": 2 2 2 2 2 2 2 2 2 2 ...
##  $ COND              : Factor w/ 55 levels "Bu12+Cy120","CY50",..: 42 30 45 28 42 29 20 20 20 19 ...
##  $ CONDTYPE          : Factor w/ 2 levels "MAC","RIC": 2 1 2 2 2 2 2 2 2 2 ...
##  $ ALIVE             : Factor w/ 3 levels "alive","died",..: 1 2 2 1 2 1 2 2 2 1 ...
##  $ LCDTC             : chr  "28.09.2022" NA NA "25.10.2022" ...
##  $ DEATHDTC          : chr  NA "06.04.2021" "17.05.2021" NA ...
##  $ RELAPYN           : Factor w/ 3 levels "no","unknown",..: 1 3 1 1 1 1 1 1 1 1 ...
##  $ RELAPDTC          : chr  NA "25.11.2020" NA NA ...
##  $ GVHDDTC           : chr  "26.10.2020" "17.07.2020" "26.06.2020" "06.03.2020" ...
##  $ GVHDAGE           : num  30 22 37 32 20 45 35 35 35 42 ...
##  $ GVHDMETH          : Factor w/ 5 levels "Glucksberg","IBMTR",..: 3 3 3 3 3 3 3 3 3 3 ...
##  $ AGVHDOCC          : chr  "yes" "yes" "yes" "yes" ...
##  $ AGVHDGR           : Factor w/ 4 levels "1","2","3","4": 1 1 3 1 3 1 3 3 3 1 ...
##  $ acute_Skin        : Factor w/ 5 levels "0","1","2","3",..: 3 2 4 3 2 2 3 3 3 2 ...
##  $ acute_Liver       : Factor w/ 5 levels "0","1","2","3",..: 1 1 3 1 4 1 3 3 3 1 ...
##  $ acute_UGT         : Factor w/ 5 levels "0","1","2","3",..: 1 1 1 1 1 1 4 4 4 1 ...
##  $ acute_LGT         : Factor w/ 5 levels "0","1","2","3",..: 1 1 2 1 1 1 4 4 4 1 ...
##  $ CGVHDOCC          : chr  "no" "no" "no" "no" ...
##  $ PTSTAT            : Factor w/ 4 levels "0","1","2","3": NA NA NA NA NA NA NA NA NA NA ...
##  $ SEVTYPE           : Factor w/ 3 levels "NIH2005","NIH2014",..: NA NA NA NA NA NA NA NA NA NA ...
##  $ SEVGRADE          : Factor w/ 3 levels "mild","moderate",..: NA NA NA NA NA NA NA NA NA NA ...
##  $ chronic_Skin_perc : Factor w/ 4 levels "0","1","2","3": NA NA NA NA NA NA NA NA NA NA ...
##  $ chronic_Skin_scl  : Factor w/ 4 levels "0","1","2","3": NA NA NA NA NA NA NA NA NA NA ...
##  $ chronic_oral      : Factor w/ 4 levels "0","1","2","3": NA NA NA NA NA NA NA NA NA NA ...
##  $ chronic_Eyes      : Factor w/ 4 levels "0","1","2","3": NA NA NA NA NA NA NA NA NA NA ...
##  $ chronic_GT        : Factor w/ 4 levels "0","1","2","3": NA NA NA NA NA NA NA NA NA NA ...
##  $ chronic_Liver     : Factor w/ 4 levels "0","1","2","3": NA NA NA NA NA NA NA NA NA NA ...
##  $ chronic_Lungs     : Factor w/ 4 levels "0","1","2","3": NA NA NA NA NA NA NA NA NA NA ...
##  $ chronic_Lungs_func: Factor w/ 4 levels "0","1","2","3": NA NA NA NA NA NA NA NA NA NA ...
##  $ chronic_Joints    : Factor w/ 3 levels "0","1","2": NA NA NA NA NA NA NA NA NA NA ...
##  $ chronic_Sex       : Factor w/ 4 levels "0","1","2","3": NA NA NA NA NA NA NA NA NA NA ...
##  $ TRIND             : chr  "acute GVHD" "acute GVHD" "acute GVHD" "acute GVHD" ...
##  $ GVHDTRYN          : chr  "yes" "no" "yes" "yes" ...
##  $ DRUGN             : Factor w/ 27 levels "ACTIVATED T-LYMPHOCYTES",..: 18 NA 14 14 14 NA 18 13 20 NA ...
##  $ ATCN              : Factor w/ 15 levels "BCR-ABL TYROSINE KINASE INHIBITORS",..: 7 NA 7 7 7 NA 7 13 11 NA ...
##  $ LOT               : Factor w/ 6 levels "1","2","3","4",..: 1 NA 1 1 1 NA 1 2 6 NA ...
##  $ TRSTDTC           : chr  "11.11.2020" NA "26.06.2020" "06.03.2020" ...
##  $ TRENDTC           : chr  "09.03.2021" NA "04.09.2020" "28.05.2020" ...
##  $ TRONG             : chr  "no" "no" "no" "no" ...
##  $ TRRESP            : Factor w/ 6 levels "complete response",..: 1 NA 2 1 1 NA 6 5 1 NA ...
##  $ RESPEV            : Factor w/ 3 levels "MAGIC","NIH2014",..: 1 NA 1 2 1 NA 2 2 2 NA ...
##  $ STERRES           : Factor w/ 3 levels "no data","no resistance",..: 2 1 3 2 2 1 3 1 1 1 ...
##  $ TDINTER           : num  52 86 45 18 46 109 47 47 47 33 ...
##  $ TPINTER           : num  3 -1 -4 3 3 0 -5 -5 -5 2 ...
##  $ PTINTER           : num  202 NA 325 233 103 NA 50 47 21 NA ...
##  $ PRINTER           : num  267 29 374 248 146 29 104 104 104 NA ...
##  $ TRINTER           : num  118 NA 70 83 70 NA 50 14 21 NA ...
# Print per-column summaries of the treatment-level table as a sanity check
summary(common_df_treatment)
##  SITE        SUBJID            BIRTHDTC             SEX     
##  01:200   Length:650         Length:650         female:304  
##  02:383   Class :character   Class :character   male  :346  
##  03:  1   Mode  :character   Mode  :character               
##  04: 66                                                     
##                                                             
##                                                             
##                                                             
##                           PTN     
##  ACUTE MYELOID LEUKAEMIA    :303  
##  ACUTE LYMPHOCYTIC LEUKAEMIA:145  
##  APLASTIC ANAEMIA           : 40  
##  CHRONIC MYELOID LEUKAEMIA  : 36  
##  MYELODYSPLASTIC SYNDROME   : 28  
##  PRIMARY MYELOFIBROSIS      : 18  
##  (Other)                    : 80  
##                                                                  PSOCN    
##  BLOOD AND LYMPHATIC SYSTEM DISORDERS                               : 40  
##  NEOPLASMS BENIGN, MALIGNANT AND UNSPECIFIED (INCL CYSTS AND POLYPS):610  
##                                                                           
##                                                                           
##                                                                           
##                                                                           
##                                                                           
##    PRSTDTC            PRSCHEM             TIMGDOSE          HATGDOSE      
##  Length:650         Length:650         Min.   : 0.0000   Min.   :  0.000  
##  Class :character   Class :character   1st Qu.: 0.0000   1st Qu.:  0.000  
##  Mode  :character   Mode  :character   Median : 0.0000   Median :  0.000  
##                                        Mean   : 0.3023   Mean   :  4.108  
##                                        3rd Qu.: 0.0000   3rd Qu.:  0.000  
##                                        Max.   :40.0000   Max.   :120.000  
##                                                                           
##    PRENDTC              TRNUM         TRTYPE       TRDTC           TRSOURCE 
##  Length:650         Min.   :1.000   Haplo:315   Length:650         BM :108  
##  Class :character   1st Qu.:1.000   MMUD :110   Class :character   BSC:542  
##  Mode  :character   Median :1.000   MRD  :106   Mode  :character            
##                     Mean   :1.182   MUD  :119                               
##                     3rd Qu.:1.000                                           
##                     Max.   :5.000                                           
##                                                                             
##           COND     CONDTYPE      ALIVE        LCDTC          
##  Flu180+Bu8 :159   MAC:130   alive  :465   Length:650        
##  Flu180+Bu10: 71   RIC:520   died   :177   Class :character  
##  Flu180+Bu12: 59             unknown:  8   Mode  :character  
##  Flu150+Bu8 : 54                                             
##  Flu180+Bu14: 39                                             
##  Flu90+Benda: 28                                             
##  (Other)    :240                                             
##    DEATHDTC            RELAPYN      RELAPDTC           GVHDDTC         
##  Length:650         no     :514   Length:650         Length:650        
##  Class :character   unknown: 11   Class :character   Class :character  
##  Mode  :character   yes    :125   Mode  :character   Mode  :character  
##                                                                        
##                                                                        
##                                                                        
##                                                                        
##     GVHDAGE              GVHDMETH     AGVHDOCC         AGVHDGR    acute_Skin
##  Min.   :12.00   Glucksberg  :188   Length:650         1   :102   0   : 73  
##  1st Qu.:24.00   IBMTR       :  1   Class :character   2   : 84   1   : 73  
##  Median :36.00   MAGIC       :146   Mode  :character   3   : 92   2   : 75  
##  Mean   :36.24   NIH2014     :279                      4   : 60   3   :100  
##  3rd Qu.:46.00   subjectively:  9                      NA's:312   4   : 17  
##  Max.   :76.00   NA's        : 27                                 NA's:312  
##  NA's   :33                                                                 
##  acute_Liver acute_UGT  acute_LGT    CGVHDOCC          PTSTAT   
##  0   :249    0   :312   0   :222   Length:650         0   : 41  
##  1   : 14    1   : 17   1   : 20   Class :character   1   : 83  
##  2   : 14    2   :  2   2   : 16   Mode  :character   2   : 76  
##  3   : 32    3   :  6   3   : 43                      3   : 85  
##  4   : 29    4   :  1   4   : 37                      NA's:365  
##  NA's:312    NA's:312   NA's:312                                
##                                                                 
##          SEVTYPE        SEVGRADE   chronic_Skin_perc chronic_Skin_scl
##  NIH2005     :  1   mild    : 49   0   : 80          0   :272        
##  NIH2014     :283   moderate: 91   1   : 83          1   :  8        
##  subjectively:  1   severe  :145   2   : 78          2   :  3        
##  NA's        :365   NA's    :365   3   : 44          3   :  2        
##                                    NA's:365          NA's:365        
##                                                                      
##                                                                      
##  chronic_oral chronic_Eyes chronic_GT chronic_Liver chronic_Lungs
##  0   :115     0   :154     0   :244   0   :168      0   :246     
##  1   : 99     1   : 77     1   :  8   1   : 17      1   : 13     
##  2   : 57     2   : 41     2   : 14   2   : 46      2   : 13     
##  3   : 14     3   : 13     3   : 19   3   : 54      3   : 13     
##  NA's:365     NA's:365     NA's:365   NA's:365      NA's:365     
##                                                                  
##                                                                  
##  chronic_Lungs_func chronic_Joints chronic_Sex    TRIND          
##  0   :254           0   :273       0   :257    Length:650        
##  1   : 12           1   :  8       1   : 16    Class :character  
##  2   :  7           2   :  4       2   :  4    Mode  :character  
##  3   : 12           NA's:365       3   :  8                      
##  NA's:365                          NA's:365                      
##                                                                  
##                                                                  
##    GVHDTRYN                        DRUGN    
##  Length:650         METHYLPREDNISOLONE:138  
##  Class :character   RUXOLITINIB       :105  
##  Mode  :character   PREDNISOLONE      : 61  
##                     TACROLIMUS        : 43  
##                     CICLOSPORIN       : 29  
##                     (Other)           : 86  
##                     NA's              :188  
##                                             ATCN       LOT     
##  GLUCOCORTICOIDS                              :201   1   :305  
##  JANUS-ASSOCIATED KINASE (JAK) INHIBITORS     :105   2   :106  
##  CALCINEURIN INHIBITORS                       : 72   3   : 38  
##  SELECTIVE IMMUNOSUPPRESSANTS                 : 38   4   :  9  
##  TUMOR NECROSIS FACTOR ALPHA (TNF-) INHIBITORS: 22   5   :  3  
##  (Other)                                      : 24   A   :  2  
##  NA's                                         :188   NA's:187  
##    TRSTDTC            TRENDTC             TRONG          
##  Length:650         Length:650         Length:650        
##  Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character  
##                                                          
##                                                          
##                                                          
##                                                          
##                TRRESP             RESPEV             STERRES       TDINTER     
##  complete response:178   MAGIC       : 34   no data      :446   Min.   :  3.0  
##  no response      : 99   NIH2014     :407   no resistance:142   1st Qu.: 37.0  
##  not estimated    : 14   subjectively:  6   resistance   : 62   Median : 89.0  
##  other            :  1   NA's        :203                       Mean   :119.5  
##  partial response :153                                          3rd Qu.:169.0  
##  progression      : 18                                          Max.   :645.0  
##  NA's             :187                                          NA's   :33     
##     TPINTER           PTINTER           PRINTER       TRINTER     
##  Min.   : -8.000   Min.   :-635.00   Min.   :  1   Min.   : -2.0  
##  1st Qu.: -1.000   1st Qu.: -81.00   1st Qu.: 72   1st Qu.: 33.0  
##  Median :  3.000   Median : -17.00   Median :120   Median : 70.0  
##  Mean   :  1.654   Mean   : -19.85   Mean   :138   Mean   :136.0  
##  3rd Qu.:  3.000   3rd Qu.:  50.00   3rd Qu.:192   3rd Qu.:187.8  
##  Max.   :125.000   Max.   : 509.00   Max.   :600   Max.   :810.0  
##  NA's   :3         NA's   :403       NA's   :289   NA's   :192

EDA

Подсчитаем описательные статистики

Нам интересно посмотреть на следующие переменные. Факторные: SEX (пол), TUTERM (диагноз), PSOCN (группа заболеваний), TIMGDOSE? (есть-нет), HATGDOSE (есть-нет), TRTYPE (тип донора), TRSOURCE (источник КМ), ALIVE (выжил ли пациент), RELAPYN (был ли рецидив), PTSTAT (состояние пациента при обследовании), SEVGRADE? (степень тяжести), все, что начинается с chronic? (проявления заболевания), STERRES (развилась ли резистентность к стероидам). Числовые: TRNUM (число пересадок), GVHDAGE (возраст начала заболевания), все, что заканчивается на INTER (временные интервалы).

# Frequency table for one categorical column.
#
# For every level of the column the function returns the number of
# observations, its share of all observations (rounded to 3 digits) and the
# Wilson confidence interval for that share ("lower-upper"). Missing values
# are counted as their own level, "no_data".
#
# Arguments:
#   df      - data frame containing the column
#   factorr - name of the column, as a single character string
# Returns a table with columns variant, variable, number,
# proportionIntoGroup and proportionCI.
catStat <- function(df, factorr){
  # all_of() selects by the string held in `factorr`; passing the external
  # vector bare is deprecated in tidyselect
  counts <- df %>% 
    select(variant = all_of(factorr)) %>% 
    mutate(variant = as.character(variant) %>% replace_na("no_data") %>% as.factor()) %>% 
    count(variant) %>% 
    rename(number = n)

  # Wilson interval per level, computed once (columns: estimate, lower, upper)
  ci <- binconf(counts$number, sum(counts$number), method = "wilson")

  counts %>% 
    mutate(proportionIntoGroup = round(number / sum(number), 3),
           proportionCI = paste(round(ci[, 2], 3), round(ci[, 3], 3), sep = "-"),
           variable = factorr) %>% 
    relocate(variable, .after = 1)
}

### Standard error of the mean, ignoring missing values.
### The denominator is the number of non-missing observations, matching the
### na.rm = TRUE used for the standard deviation (length(x) also counted NAs
### and so understated the standard error).
se <- function(x){
  n_obs <- sum(!is.na(x))
  sd(x, na.rm = TRUE) / sqrt(n_obs)
}

### Descriptive statistics for numeric variables.
### A named list of lambdas intended for across(); every statistic is returned
### as a character string so that all of them can share one column.
statistics <- list(
  Counts = ~ as.character(length(.x)),
  NAs = ~ as.character(sum(is.na(.x))),
  Mean = ~ as.character(round(mean(.x, na.rm = TRUE), 3)),
  SD = ~ as.character(round(sd(.x, na.rm = TRUE), 3)),
  # Normal-approximation 95% interval for the mean, "lower-upper"
  CI95 = ~ paste(
    round(mean(.x, na.rm = TRUE) - 1.96 * se(.x), 3),
    round(mean(.x, na.rm = TRUE) + 1.96 * se(.x), 3),
    sep = "-"
  ),
  Median = ~ as.character(round(median(.x, na.rm = TRUE), 3)),
  # First and third quartile, "q25-q75"
  Quantiles = ~ paste(
    round(quantile(.x, probs = c(0.25, 0.75), na.rm = TRUE), 3),
    collapse = "-"
  ),
  Iqr = ~ as.character(round(IQR(.x, na.rm = TRUE), 3)),
  Min = ~ as.character(round(min(.x, na.rm = TRUE), 3)),
  Max = ~ as.character(round(max(.x, na.rm = TRUE), 3))
)

По-хорошему весь нижележащий код надо обернуть в функции

# Frequency table (count, share, Wilson CI) for every factor column of the
# per-patient table, stacked into one data frame.
factor_patient_table <- common_df_patient %>%
  select(where(is.factor)) %>%
  colnames() %>%
  lapply(function(col) as.data.frame(catStat(common_df_patient, col))) %>%
  do.call(rbind, .) %>%
  select(variable, variant, number,
         group_proportion = proportionIntoGroup,
         proportion_CI = proportionCI)
## Warning: Using an external vector in selections was deprecated in tidyselect 1.1.0.
## ℹ Please use `all_of()` or `any_of()` instead.
##   # Was:
##   data %>% select(x)
## 
##   # Now:
##   data %>% select(all_of(x))
## 
## See <https://tidyselect.r-lib.org/reference/faq-external-vector.html>.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
# Descriptive statistics for every numeric column of the per-patient table:
# one row per statistic, one column per variable.
numeric_patient_table <- common_df_patient %>% 
  # where() wraps the predicate; bare predicates are deprecated in tidyselect
  summarise(across(where(is.numeric), statistics)) %>%
  stack() %>% 
  rename(value = values) %>% 
  # across() names its outputs "<column>_<statistic>"; split them back apart
  separate(ind, sep = "_", into = c("variable", "statistic")) %>% 
  pivot_wider(
    names_from = variable,
    values_from = value
  )
## Warning: There was 1 warning in `summarise()`.
## ℹ In argument: `across(is.numeric, statistics)`.
## Caused by warning:
## ! Use of bare predicate functions was deprecated in tidyselect 1.1.0.
## ℹ Please use wrap predicates in `where()` instead.
##   # Was:
##   data %>% select(is.numeric)
## 
##   # Now:
##   data %>% select(where(is.numeric))
# Frequency table (count, share, Wilson CI) for every factor column of the
# acute-GVHD rows of the disease-level table, stacked into one data frame.
factor_disease_acute_table <- common_df_disease %>%
  filter(TRIND == "acute GVHD") %>%
  select(where(is.factor)) %>%
  colnames() %>%
  lapply(function(col) {
    as.data.frame(catStat(filter(common_df_disease, TRIND == "acute GVHD"), col))
  }) %>%
  do.call(rbind, .) %>%
  select(variable, variant, number,
         group_proportion = proportionIntoGroup,
         proportion_CI = proportionCI)

# Descriptive statistics for every numeric column of the disease records whose
# TRIND is "acute GVHD": one row per statistic, one column per variable.
# `statistics` is the list of summary lambdas defined earlier in the file.
numeric_disease_acute_table <- common_df_disease %>%
  filter(TRIND == "acute GVHD") %>%
  # where() replaces the bare predicate deprecated in tidyselect 1.1.0.
  summarise(across(where(is.numeric), statistics)) %>%
  stack() %>%
  rename(value = values) %>%
  # The stacked names are split on "_" into the column name and statistic name.
  # NOTE(review): assumes neither part contains "_" itself -- confirm.
  separate(ind, sep = "_", into = c("variable", "statistic")) %>%
  pivot_wider(
    names_from = variable,
    values_from = value
  )

# Frequency table for every factor column of the disease records whose TRIND
# is "chronic GVHD".
# The subset is computed once inside local() and reused for every column; the
# filter used to be written twice and re-run inside the per-column callback.
# catStat() is defined elsewhere in this file.
factor_disease_chronic_table <- local({
  disease_subset <- common_df_disease %>%
    filter(TRIND == "chronic GVHD")
  factor_columns <- disease_subset %>%
    select(where(is.factor)) %>%
    colnames()

  lapply(
    factor_columns,
    function(column_name) as.data.frame(catStat(disease_subset, column_name))
  ) %>%
    do.call(rbind, .) %>%
    select(variable, variant, number, proportionIntoGroup, proportionCI) %>%
    rename(
      group_proportion = proportionIntoGroup,
      proportion_CI = proportionCI
    )
})

# Descriptive statistics for every numeric column of the disease records whose
# TRIND is "chronic GVHD": one row per statistic, one column per variable.
# `statistics` is the list of summary lambdas defined earlier in the file.
numeric_disease_chronic_table <- common_df_disease %>%
  filter(TRIND == "chronic GVHD") %>%
  # where() replaces the bare predicate deprecated in tidyselect 1.1.0.
  summarise(across(where(is.numeric), statistics)) %>%
  stack() %>%
  rename(value = values) %>%
  # The stacked names are split on "_" into the column name and statistic name.
  # NOTE(review): assumes neither part contains "_" itself -- confirm.
  separate(ind, sep = "_", into = c("variable", "statistic")) %>%
  pivot_wider(
    names_from = variable,
    values_from = value
  )

# Frequency table for every factor column of the disease records whose TRIND
# is "cross syndrome".
# The subset is computed once inside local() and reused for every column; the
# filter used to be written twice and re-run inside the per-column callback.
# catStat() is defined elsewhere in this file.
factor_disease_cross_table <- local({
  disease_subset <- common_df_disease %>%
    filter(TRIND == "cross syndrome")
  factor_columns <- disease_subset %>%
    select(where(is.factor)) %>%
    colnames()

  lapply(
    factor_columns,
    function(column_name) as.data.frame(catStat(disease_subset, column_name))
  ) %>%
    do.call(rbind, .) %>%
    select(variable, variant, number, proportionIntoGroup, proportionCI) %>%
    rename(
      group_proportion = proportionIntoGroup,
      proportion_CI = proportionCI
    )
})

# Descriptive statistics for every numeric column of the disease records whose
# TRIND is "cross syndrome": one row per statistic, one column per variable.
# `statistics` is the list of summary lambdas defined earlier in the file.
numeric_disease_cross_table <- common_df_disease %>%
  filter(TRIND == "cross syndrome") %>%
  # where() replaces the bare predicate deprecated in tidyselect 1.1.0.
  summarise(across(where(is.numeric), statistics)) %>%
  stack() %>%
  rename(value = values) %>%
  # The stacked names are split on "_" into the column name and statistic name.
  # NOTE(review): assumes neither part contains "_" itself -- confirm.
  separate(ind, sep = "_", into = c("variable", "statistic")) %>%
  pivot_wider(
    names_from = variable,
    values_from = value
  )
## Warning: There were 2 warnings in `summarise()`.
## The first warning was:
## ℹ In argument: `across(is.numeric, statistics)`.
## Caused by warning in `min()`:
## ! no non-missing arguments to min; returning Inf
## ℹ Run `dplyr::last_dplyr_warnings()` to see the 1 remaining warning.
# Frequency table for every factor column of the treatment records whose TRIND
# is "acute GVHD".
# The subset is computed once inside local() and reused for every column; the
# filter used to be written twice and re-run inside the per-column callback.
# catStat() is defined elsewhere in this file.
factor_treatment_acute_table <- local({
  treatment_subset <- common_df_treatment %>%
    filter(TRIND == "acute GVHD")
  factor_columns <- treatment_subset %>%
    select(where(is.factor)) %>%
    colnames()

  lapply(
    factor_columns,
    function(column_name) as.data.frame(catStat(treatment_subset, column_name))
  ) %>%
    do.call(rbind, .) %>%
    select(variable, variant, number, proportionIntoGroup, proportionCI) %>%
    rename(
      group_proportion = proportionIntoGroup,
      proportion_CI = proportionCI
    )
})

# Descriptive statistics for every numeric column of the treatment records
# whose TRIND is "acute GVHD": one row per statistic, one column per variable.
# `statistics` is the list of summary lambdas defined earlier in the file.
numeric_treatment_acute_table <- common_df_treatment %>%
  filter(TRIND == "acute GVHD") %>%
  # where() replaces the bare predicate deprecated in tidyselect 1.1.0.
  summarise(across(where(is.numeric), statistics)) %>%
  stack() %>%
  rename(value = values) %>%
  # The stacked names are split on "_" into the column name and statistic name.
  # NOTE(review): assumes neither part contains "_" itself -- confirm.
  separate(ind, sep = "_", into = c("variable", "statistic")) %>%
  pivot_wider(
    names_from = variable,
    values_from = value
  )

# Frequency table for every factor column of the treatment records whose TRIND
# is "chronic GVHD".
# The subset is computed once inside local() and reused for every column; the
# filter used to be written twice and re-run inside the per-column callback.
# catStat() is defined elsewhere in this file.
factor_treatment_chronic_table <- local({
  treatment_subset <- common_df_treatment %>%
    filter(TRIND == "chronic GVHD")
  factor_columns <- treatment_subset %>%
    select(where(is.factor)) %>%
    colnames()

  lapply(
    factor_columns,
    function(column_name) as.data.frame(catStat(treatment_subset, column_name))
  ) %>%
    do.call(rbind, .) %>%
    select(variable, variant, number, proportionIntoGroup, proportionCI) %>%
    rename(
      group_proportion = proportionIntoGroup,
      proportion_CI = proportionCI
    )
})

# Descriptive statistics for every numeric column of the treatment records
# whose TRIND is "chronic GVHD": one row per statistic, one column per variable.
# `statistics` is the list of summary lambdas defined earlier in the file.
numeric_treatment_chronic_table <- common_df_treatment %>%
  filter(TRIND == "chronic GVHD") %>%
  # where() replaces the bare predicate deprecated in tidyselect 1.1.0.
  summarise(across(where(is.numeric), statistics)) %>%
  stack() %>%
  rename(value = values) %>%
  # The stacked names are split on "_" into the column name and statistic name.
  # NOTE(review): assumes neither part contains "_" itself -- confirm.
  separate(ind, sep = "_", into = c("variable", "statistic")) %>%
  pivot_wider(
    names_from = variable,
    values_from = value
  )

# Frequency table for every factor column of the treatment records whose TRIND
# is "cross syndrome".
# The subset is computed once inside local() and reused for every column; the
# filter used to be written twice and re-run inside the per-column callback.
# catStat() is defined elsewhere in this file.
factor_treatment_cross_table <- local({
  treatment_subset <- common_df_treatment %>%
    filter(TRIND == "cross syndrome")
  factor_columns <- treatment_subset %>%
    select(where(is.factor)) %>%
    colnames()

  lapply(
    factor_columns,
    function(column_name) as.data.frame(catStat(treatment_subset, column_name))
  ) %>%
    do.call(rbind, .) %>%
    select(variable, variant, number, proportionIntoGroup, proportionCI) %>%
    rename(
      group_proportion = proportionIntoGroup,
      proportion_CI = proportionCI
    )
})

# Descriptive statistics for every numeric column of the treatment records
# whose TRIND is "cross syndrome": one row per statistic, one column per
# variable.
# `statistics` is the list of summary lambdas defined earlier in the file.
numeric_treatment_cross_table <- common_df_treatment %>%
  filter(TRIND == "cross syndrome") %>%
  # where() replaces the bare predicate deprecated in tidyselect 1.1.0.
  summarise(across(where(is.numeric), statistics)) %>%
  stack() %>%
  rename(value = values) %>%
  # The stacked names are split on "_" into the column name and statistic name.
  # NOTE(review): assumes neither part contains "_" itself -- confirm.
  separate(ind, sep = "_", into = c("variable", "statistic")) %>%
  pivot_wider(
    names_from = variable,
    values_from = value
  )
## Warning: There were 4 warnings in `summarise()`.
## The first warning was:
## ℹ In argument: `across(is.numeric, statistics)`.
## Caused by warning in `min()`:
## ! no non-missing arguments to min; returning Inf
## ℹ Run `dplyr::last_dplyr_warnings()` to see the 3 remaining warnings.

Visualisation

#' Bar chart of observation counts per level of one column.
#'
#' @param df Data frame that contains the column to plot.
#' @param variable Name of the column, given as a single string.
#' @param filt Fill colour of the bars (default "green").
#' @return A ggplot object; axis labels and title are in Russian and embed
#'   the column name.
bar_custom <- function(df, variable, filt = "green") {
  ggplot(df) +
    # .data[[variable]] looks the column up by name in the layer data instead
    # of embedding a pulled copy of the column vector in the aesthetic.
    geom_bar(aes(x = .data[[variable]]), fill = filt, colour = "black") +
    labs(
      x = glue("Градации переменной {variable}"),
      y = "Количество",
      title = glue("Частоты единиц наблюдения по категориям\nпеременной {variable}")
    ) +
    theme_bw() +
    theme(
      axis.text.x = element_text(size = 14, angle = 25),
      axis.text.y = element_text(size = 14),
      axis.title.x = element_text(size = 18),
      axis.title.y = element_text(size = 18),
      plot.title = element_text(size = 22, hjust = 0.5)
    )
}

#' Count plot (geom_count) of two columns against each other.
#'
#' @param df Data frame that contains both columns.
#' @param variable1 Name of the column mapped to the x axis (single string).
#' @param variable2 Name of the column mapped to the y axis (single string).
#' @return A ggplot object; axis labels and title are in Russian and embed
#'   both column names.
factor_count <- function(df, variable1, variable2) {
  ggplot(df) +
    # .data[[name]] looks each column up by name in the layer data instead of
    # embedding pulled copies of the column vectors in the aesthetics.
    geom_count(aes(x = .data[[variable1]], y = .data[[variable2]])) +
    labs(
      x = glue("Градации переменной {variable1}"),
      y = glue("Градации переменной {variable2}"),
      title = glue("Количество единиц наблюдения по категориям\nпеременной {variable1} и {variable2}")
    ) +
    theme_bw() +
    theme(
      axis.text.x = element_text(size = 14, angle = 25),
      axis.text.y = element_text(size = 14),
      axis.title.x = element_text(size = 18),
      axis.title.y = element_text(size = 18),
      plot.title = element_text(size = 22, hjust = 0.5)
    )
}


# Подумать насчет аналога каунтов для процентов

# Общий график для дат
# Section banner printed to the output; the Russian text reads "Bar plots by patient".
print("Барплоты по пациентам")
## [1] "Барплоты по пациентам"
# Draw one bar chart (default fill colour) for each of the listed columns of
# common_df_patient; the resulting list of plots is auto-printed.
common_df_patient %>%
  select(
    SITE, SEX, PTN, PSOCN,
    TRTYPE, TRSOURCE, CONDTYPE,
    RELAPYN, ALIVE
  ) %>%
  colnames() %>%
  lapply(function(var_name) bar_custom(common_df_patient, var_name))
## [[1]]

## 
## [[2]]

## 
## [[3]]

## 
## [[4]]

## 
## [[5]]

## 
## [[6]]

## 
## [[7]]

## 
## [[8]]

## 
## [[9]]

# Section banner printed to the output; the Russian text reads "Bar plots by disease".
print("Барплоты по заболеванию")
## [1] "Барплоты по заболеванию"
# Draw one red bar chart for each of the PTSTAT and SEVGRADE columns of
# common_df_disease_chronic; the resulting list of plots is auto-printed.
common_df_disease_chronic %>%
  select(PTSTAT, SEVGRADE) %>%
  colnames() %>%
  lapply(function(var_name) {
    bar_custom(common_df_disease_chronic, var_name, filt = "red")
  })
## [[1]]

## 
## [[2]]

# Section banner printed to the output; the Russian text reads "Bar plots by therapy".
print("Барплоты по терапии")
## [1] "Барплоты по терапии"
# Draw one blue bar chart for each of the LOT, STERRES, TRRESP and ATCN columns
# of common_df_treatment_chronic; the resulting list of plots is auto-printed.
common_df_treatment_chronic %>%
  select(LOT, STERRES, TRRESP, ATCN) %>%
  colnames() %>%
  lapply(function(var_name) {
    bar_custom(common_df_treatment_chronic, var_name, filt = "blue")
  })
## [[1]]

## 
## [[2]]

## 
## [[3]]

## 
## [[4]]

# Section banner printed to the output; the Russian text reads
# "Count plots by survival -- patients".
print("Каунтплоты по выживаемости -- пациенты")
## [1] "Каунтплоты по выживаемости -- пациенты"
# Draw one count plot of each listed column of common_df_patient against ALIVE;
# the resulting list of plots is auto-printed.
common_df_patient %>%
  select(
    SITE, SEX, PTN, PSOCN,
    TRTYPE, TRSOURCE, CONDTYPE,
    RELAPYN
  ) %>%
  colnames() %>%
  lapply(function(var_name) {
    factor_count(common_df_patient, var_name, variable2 = "ALIVE")
  })
## [[1]]

## 
## [[2]]

## 
## [[3]]

## 
## [[4]]

## 
## [[5]]

## 
## [[6]]

## 
## [[7]]

## 
## [[8]]

# Section banner printed to the output; the Russian text reads
# "Count plots by survival -- disease".
print("Каунтплоты по выживаемости -- заболевание")
## [1] "Каунтплоты по выживаемости -- заболевание"
# Draw one count plot of PTSTAT and of SEVGRADE against ALIVE, using
# common_df_disease_chronic; the resulting list of plots is auto-printed.
common_df_disease_chronic %>%
  select(PTSTAT, SEVGRADE) %>%
  colnames() %>%
  lapply(function(var_name) {
    factor_count(common_df_disease_chronic, var_name, variable2 = "ALIVE")
  })
## [[1]]

## 
## [[2]]

# Section banner printed to the output; the Russian text reads
# "Count plots by survival -- therapy".
print("Каунтплоты по выживаемости -- терапия")
## [1] "Каунтплоты по выживаемости -- терапия"
# Draw one count plot of LOT, STERRES, TRRESP and ATCN against ALIVE, using
# common_df_treatment_chronic; the resulting list of plots is auto-printed.
common_df_treatment_chronic %>%
  select(LOT, STERRES, TRRESP, ATCN) %>%
  colnames() %>%
  lapply(function(var_name) {
    factor_count(common_df_treatment_chronic, var_name, variable2 = "ALIVE")
  })
## [[1]]

## 
## [[2]]

## 
## [[3]]

## 
## [[4]]

# Section banner printed to the output; the Russian text reads
# "Count plots by O vs X -- disease".
# NOTE(review): "O vs X" presumably abbreviates acute vs chronic -- confirm.
# The plotting code for this section is still commented out below.
print("Каунтплоты по ОvsХ -- заболевание")
## [1] "Каунтплоты по ОvsХ -- заболевание"
# lapply(common_df_disease %>% 
#          filter(#уточнить дату окончания исследования#)
#          select(SITE, SEX, PTN, PSOCN, 
#                 TRTYPE, TRSOURCE, CONDTYPE,
#                 RELAPYN, PTSTAT, SEVGRADE))

# Section banner printed to the output; the Russian text reads
# "Steroid resistance -- therapy".
print("Резистентность к стероидам -- терапия")
## [1] "Резистентность к стероидам -- терапия"
# Draw one count plot of each listed column of common_df_treatment_chronic
# against STERRES; the resulting list of plots is auto-printed.
common_df_treatment_chronic %>%
  select(
    SITE, SEX, PTN, PSOCN,
    TRTYPE, TRSOURCE, CONDTYPE,
    RELAPYN, ALIVE, PTSTAT, SEVGRADE,
    LOT, TRRESP, ATCN
  ) %>%
  colnames() %>%
  lapply(function(var_name) {
    factor_count(common_df_treatment_chronic, var_name, variable2 = "STERRES")
  })
## [[1]]

## 
## [[2]]

## 
## [[3]]

## 
## [[4]]

## 
## [[5]]

## 
## [[6]]

## 
## [[7]]

## 
## [[8]]

## 
## [[9]]

## 
## [[10]]

## 
## [[11]]

## 
## [[12]]

## 
## [[13]]

## 
## [[14]]

# ggplot(common_df_disease_chronic)+
#     geom_bar(aes(x = pull(common_df_disease_chronic["PTSTAT"])), fill="green", colour="black")+
#     # labs(x = glue("Градации переменной {variable}"),
#     #    y = "Количество",
#     #    title = glue("Частоты единиц наблюдения по категориям\nпеременной {variable}"))+
#     theme_bw()+
#     theme(axis.text.x = element_text(size=14, angle = 25),
#           axis.text.y = element_text(size=14),
#           axis.title.x = element_text(size=18),
#           axis.title.y = element_text(size=18),
#           plot.title = element_text(size=22, hjust=0.5))

Боксплоты для продолжительности временных интервалов

#нбд Нормально оформить подписи и названия графиков
#' Single box plot of one column, with the y axis labelled in days.
#'
#' @param df Data frame that contains the column to plot.
#' @param y Name of the column mapped to the y axis (single string).
#' @return A ggplot object with a green box plot; the y-axis label is the
#'   Russian for "Number of days".
boxpCreator_inter <- function(df, y) {
  ggplot(df) +
    # .data[[y]] looks the column up by name in the layer data instead of
    # embedding a pulled copy of the column vector in the aesthetic.
    geom_boxplot(
      aes(y = .data[[y]]),
      fill = "green",
      color = "black"
    ) +
    labs(y = "Количество дней") +
    theme_bw() +
    theme(
      axis.text.y = element_text(size = 16),
      axis.title.x = element_text(size = 19),
      axis.title.y = element_text(size = 19),
      plot.title = element_text(size = 20, hjust = 0.5),
      legend.title = element_text(size = 21),
      legend.text = element_text(size = 18)
    )
}

# Draw one box plot for every column of common_df_treatment_chronic whose name
# ends with "INTER"; the resulting list of plots is auto-printed.
common_df_treatment_chronic %>%
  select(ends_with("INTER")) %>%
  colnames() %>%
  lapply(function(interval_col) {
    boxpCreator_inter(common_df_treatment_chronic, interval_col)
  })
## [[1]]
## Warning: Removed 6 rows containing non-finite values (`stat_boxplot()`).

## 
## [[2]]

## 
## [[3]]
## Warning: Removed 184 rows containing non-finite values (`stat_boxplot()`).

## 
## [[4]]
## Warning: Removed 142 rows containing non-finite values (`stat_boxplot()`).

## 
## [[5]]
## Warning: Removed 77 rows containing non-finite values (`stat_boxplot()`).

Другой вариант табличек

# Summary table of the selected patient-level columns, stratified by ALIVE.
# To leave out patients with unknown status, apply filter(ALIVE != "unknown")
# to the data before it reaches tbl_summary().
tbl_summary(
  data = select(
    common_df_patient,
    SITE, SEX, PTN, PSOCN,
    TRTYPE, TRSOURCE, CONDTYPE,
    RELAPYN, ALIVE
  ),
  by = ALIVE
) # optionally pipe the result into the add_p() call kept commented out below
Characteristic alive, N = 2521 died, N = 861 unknown, N = 51
SITE
    01 77 (31%) 35 (41%) 5 (100%)
    02 150 (60%) 40 (47%) 0 (0%)
    03 1 (0.4%) 0 (0%) 0 (0%)
    04 24 (9.5%) 11 (13%) 0 (0%)
SEX
    female 137 (54%) 38 (44%) 2 (40%)
    male 115 (46%) 48 (56%) 3 (60%)
PTN
    ACUTE LYMPHOCYTIC LEUKAEMIA 54 (21%) 26 (30%) 5 (100%)
    ACUTE MYELOID LEUKAEMIA 114 (45%) 38 (44%) 0 (0%)
    ACUTE MYELOMONOCYTIC LEUKAEMIA 1 (0.4%) 0 (0%) 0 (0%)
    ACUTE PROMYELOCYTIC LEUKAEMIA 1 (0.4%) 1 (1.2%) 0 (0%)
    ANAPLASTIC LARGE-CELL LYMPHOMA 1 (0.4%) 0 (0%) 0 (0%)
    ANGIOIMMUNOBLASTIC T-CELL LYMPHOMA 1 (0.4%) 0 (0%) 0 (0%)
    APLASTIC ANAEMIA 18 (7.1%) 6 (7.0%) 0 (0%)
    BURKITT'S LYMPHOMA 0 (0%) 1 (1.2%) 0 (0%)
    CHRONIC LYMPHOCYTIC LEUKAEMIA 6 (2.4%) 0 (0%) 0 (0%)
    CHRONIC MYELOID LEUKAEMIA 14 (5.6%) 3 (3.5%) 0 (0%)
    CHRONIC MYELOMONOCYTIC LEUKAEMIA 2 (0.8%) 0 (0%) 0 (0%)
    DIFFUSE LARGE B-CELL LYMPHOMA 2 (0.8%) 2 (2.3%) 0 (0%)
    HODGKIN'S DISEASE 4 (1.6%) 2 (2.3%) 0 (0%)
    MANTLE CELL LYMPHOMA 2 (0.8%) 3 (3.5%) 0 (0%)
    MYELODYSPLASTIC SYNDROME 15 (6.0%) 3 (3.5%) 0 (0%)
    PERIPHERAL T-CELL LYMPHOMA UNSPECIFIED 3 (1.2%) 0 (0%) 0 (0%)
    PLASMA CELL MYELOMA 1 (0.4%) 0 (0%) 0 (0%)
    PRIMARY MEDIASTINAL LARGE B-CELL LYMPHOMA 1 (0.4%) 0 (0%) 0 (0%)
    PRIMARY MYELOFIBROSIS 9 (3.6%) 0 (0%) 0 (0%)
    T-CELL LYMPHOMA 1 (0.4%) 0 (0%) 0 (0%)
    T-CELL TYPE ACUTE LEUKAEMIA 2 (0.8%) 1 (1.2%) 0 (0%)
PSOCN
    BLOOD AND LYMPHATIC SYSTEM DISORDERS 18 (7.1%) 6 (7.0%) 0 (0%)
    NEOPLASMS BENIGN, MALIGNANT AND UNSPECIFIED (INCL CYSTS AND POLYPS) 234 (93%) 80 (93%) 5 (100%)
TRTYPE
    Haplo 97 (38%) 55 (64%) 4 (80%)
    MMUD 49 (19%) 12 (14%) 0 (0%)
    MRD 55 (22%) 9 (10%) 1 (20%)
    MUD 51 (20%) 10 (12%) 0 (0%)
TRSOURCE
    BM 36 (14%) 11 (13%) 0 (0%)
    BSC 216 (86%) 75 (87%) 5 (100%)
CONDTYPE
    MAC 30 (12%) 17 (20%) 1 (20%)
    RIC 222 (88%) 69 (80%) 4 (80%)
RELAPYN
    no 218 (87%) 48 (56%) 0 (0%)
    unknown 3 (1.2%) 4 (4.7%) 0 (0%)
    yes 31 (12%) 34 (40%) 5 (100%)
1 n (%)
  #add_p()
# Summary table of the selected columns of common_df_disease_chronic,
# stratified by ALIVE.
# To leave out records with unknown status, apply filter(ALIVE != "unknown")
# to the data before it reaches tbl_summary().
tbl_summary(
  data = select(
    common_df_disease_chronic,
    SITE, SEX, PTN, PSOCN,
    TRTYPE, TRSOURCE, CONDTYPE,
    RELAPYN, ALIVE,
    PTSTAT, SEVGRADE
  ),
  by = ALIVE
) # optionally pipe the result into the add_p() call kept commented out below
Characteristic alive, N = 1671 died, N = 271 unknown, N = 31
SITE
    01 59 (35%) 9 (33%) 3 (100%)
    02 95 (57%) 16 (59%) 0 (0%)
    03 0 (0%) 0 (0%) 0 (0%)
    04 13 (7.8%) 2 (7.4%) 0 (0%)
SEX
    female 88 (53%) 12 (44%) 1 (33%)
    male 79 (47%) 15 (56%) 2 (67%)
PTN
    ACUTE LYMPHOCYTIC LEUKAEMIA 35 (21%) 5 (19%) 3 (100%)
    ACUTE MYELOID LEUKAEMIA 76 (46%) 14 (52%) 0 (0%)
    ACUTE MYELOMONOCYTIC LEUKAEMIA 0 (0%) 0 (0%) 0 (0%)
    ACUTE PROMYELOCYTIC LEUKAEMIA 1 (0.6%) 1 (3.7%) 0 (0%)
    ANAPLASTIC LARGE-CELL LYMPHOMA 0 (0%) 0 (0%) 0 (0%)
    ANGIOIMMUNOBLASTIC T-CELL LYMPHOMA 1 (0.6%) 0 (0%) 0 (0%)
    APLASTIC ANAEMIA 12 (7.2%) 2 (7.4%) 0 (0%)
    BURKITT'S LYMPHOMA 0 (0%) 0 (0%) 0 (0%)
    CHRONIC LYMPHOCYTIC LEUKAEMIA 2 (1.2%) 0 (0%) 0 (0%)
    CHRONIC MYELOID LEUKAEMIA 9 (5.4%) 3 (11%) 0 (0%)
    CHRONIC MYELOMONOCYTIC LEUKAEMIA 1 (0.6%) 0 (0%) 0 (0%)
    DIFFUSE LARGE B-CELL LYMPHOMA 1 (0.6%) 1 (3.7%) 0 (0%)
    HODGKIN'S DISEASE 3 (1.8%) 1 (3.7%) 0 (0%)
    MANTLE CELL LYMPHOMA 2 (1.2%) 0 (0%) 0 (0%)
    MYELODYSPLASTIC SYNDROME 10 (6.0%) 0 (0%) 0 (0%)
    PERIPHERAL T-CELL LYMPHOMA UNSPECIFIED 2 (1.2%) 0 (0%) 0 (0%)
    PLASMA CELL MYELOMA 1 (0.6%) 0 (0%) 0 (0%)
    PRIMARY MEDIASTINAL LARGE B-CELL LYMPHOMA 0 (0%) 0 (0%) 0 (0%)
    PRIMARY MYELOFIBROSIS 8 (4.8%) 0 (0%) 0 (0%)
    T-CELL LYMPHOMA 1 (0.6%) 0 (0%) 0 (0%)
    T-CELL TYPE ACUTE LEUKAEMIA 2 (1.2%) 0 (0%) 0 (0%)
PSOCN
    BLOOD AND LYMPHATIC SYSTEM DISORDERS 12 (7.2%) 2 (7.4%) 0 (0%)
    NEOPLASMS BENIGN, MALIGNANT AND UNSPECIFIED (INCL CYSTS AND POLYPS) 155 (93%) 25 (93%) 3 (100%)
TRTYPE
    Haplo 65 (39%) 17 (63%) 2 (67%)
    MMUD 29 (17%) 5 (19%) 0 (0%)
    MRD 40 (24%) 4 (15%) 1 (33%)
    MUD 33 (20%) 1 (3.7%) 0 (0%)
TRSOURCE
    BM 24 (14%) 5 (19%) 0 (0%)
    BSC 143 (86%) 22 (81%) 3 (100%)
CONDTYPE
    MAC 22 (13%) 5 (19%) 1 (33%)
    RIC 145 (87%) 22 (81%) 2 (67%)
RELAPYN
    no 149 (89%) 15 (56%) 0 (0%)
    unknown 2 (1.2%) 1 (3.7%) 0 (0%)
    yes 16 (9.6%) 11 (41%) 3 (100%)
PTSTAT
    0 28 (17%) 1 (3.7%) 1 (33%)
    1 53 (32%) 7 (26%) 1 (33%)
    2 44 (26%) 12 (44%) 1 (33%)
    3 42 (25%) 7 (26%) 0 (0%)
SEVGRADE
    mild 43 (26%) 5 (19%) 1 (33%)
    moderate 59 (35%) 9 (33%) 0 (0%)
    severe 65 (39%) 13 (48%) 2 (67%)
1 n (%)
 #add_p()
# Summary table of the selected columns of common_df_treatment_chronic,
# stratified by STERRES.
# To leave out records without resistance data, apply
# filter(STERRES != "no data") to the data before it reaches tbl_summary().
tbl_summary(
  data = select(
    common_df_treatment_chronic,
    SITE, SEX, PTN, PSOCN,
    TRTYPE, TRSOURCE, CONDTYPE,
    RELAPYN, ALIVE, PTSTAT, SEVGRADE,
    LOT, TRRESP, ATCN, STERRES
  ),
  by = STERRES
) # optionally pipe the result into the add_p() call kept commented out below
Characteristic no data, N = 2051 no resistance, N = 611 resistance, N = 191
SITE
    01 44 (21%) 40 (66%) 8 (42%)
    02 148 (72%) 19 (31%) 11 (58%)
    03 0 (0%) 0 (0%) 0 (0%)
    04 13 (6.3%) 2 (3.3%) 0 (0%)
SEX
    female 103 (50%) 26 (43%) 10 (53%)
    male 102 (50%) 35 (57%) 9 (47%)
PTN
    ACUTE LYMPHOCYTIC LEUKAEMIA 41 (20%) 14 (23%) 2 (11%)
    ACUTE MYELOID LEUKAEMIA 99 (48%) 23 (38%) 9 (47%)
    ACUTE MYELOMONOCYTIC LEUKAEMIA 0 (0%) 0 (0%) 0 (0%)
    ACUTE PROMYELOCYTIC LEUKAEMIA 2 (1.0%) 0 (0%) 0 (0%)
    ANAPLASTIC LARGE-CELL LYMPHOMA 0 (0%) 0 (0%) 0 (0%)
    ANGIOIMMUNOBLASTIC T-CELL LYMPHOMA 0 (0%) 1 (1.6%) 0 (0%)
    APLASTIC ANAEMIA 14 (6.8%) 2 (3.3%) 2 (11%)
    BURKITT'S LYMPHOMA 0 (0%) 0 (0%) 0 (0%)
    CHRONIC LYMPHOCYTIC LEUKAEMIA 2 (1.0%) 0 (0%) 0 (0%)
    CHRONIC MYELOID LEUKAEMIA 11 (5.4%) 4 (6.6%) 2 (11%)
    CHRONIC MYELOMONOCYTIC LEUKAEMIA 4 (2.0%) 1 (1.6%) 0 (0%)
    DIFFUSE LARGE B-CELL LYMPHOMA 5 (2.4%) 0 (0%) 3 (16%)
    HODGKIN'S DISEASE 6 (2.9%) 1 (1.6%) 1 (5.3%)
    MANTLE CELL LYMPHOMA 2 (1.0%) 0 (0%) 0 (0%)
    MYELODYSPLASTIC SYNDROME 6 (2.9%) 6 (9.8%) 0 (0%)
    PERIPHERAL T-CELL LYMPHOMA UNSPECIFIED 1 (0.5%) 1 (1.6%) 0 (0%)
    PLASMA CELL MYELOMA 1 (0.5%) 1 (1.6%) 0 (0%)
    PRIMARY MEDIASTINAL LARGE B-CELL LYMPHOMA 0 (0%) 0 (0%) 0 (0%)
    PRIMARY MYELOFIBROSIS 7 (3.4%) 4 (6.6%) 0 (0%)
    T-CELL LYMPHOMA 2 (1.0%) 1 (1.6%) 0 (0%)
    T-CELL TYPE ACUTE LEUKAEMIA 2 (1.0%) 2 (3.3%) 0 (0%)
PSOCN
    BLOOD AND LYMPHATIC SYSTEM DISORDERS 14 (6.8%) 2 (3.3%) 2 (11%)
    NEOPLASMS BENIGN, MALIGNANT AND UNSPECIFIED (INCL CYSTS AND POLYPS) 191 (93%) 59 (97%) 17 (89%)
TRTYPE
    Haplo 93 (45%) 24 (39%) 7 (37%)
    MMUD 42 (20%) 9 (15%) 3 (16%)
    MRD 34 (17%) 19 (31%) 3 (16%)
    MUD 36 (18%) 9 (15%) 6 (32%)
TRSOURCE
    BM 43 (21%) 8 (13%) 4 (21%)
    BSC 162 (79%) 53 (87%) 15 (79%)
CONDTYPE
    MAC 48 (23%) 9 (15%) 4 (21%)
    RIC 157 (77%) 52 (85%) 15 (79%)
RELAPYN
    no 170 (83%) 54 (89%) 16 (84%)
    unknown 3 (1.5%) 0 (0%) 0 (0%)
    yes 32 (16%) 7 (11%) 3 (16%)
ALIVE
    alive 173 (84%) 51 (84%) 15 (79%)
    died 30 (15%) 9 (15%) 4 (21%)
    unknown 2 (1.0%) 1 (1.6%) 0 (0%)
PTSTAT
    0 26 (13%) 12 (20%) 3 (16%)
    1 65 (32%) 13 (21%) 5 (26%)
    2 55 (27%) 17 (28%) 4 (21%)
    3 59 (29%) 19 (31%) 7 (37%)
SEVGRADE
    mild 49 (24%) 0 (0%) 0 (0%)
    moderate 65 (32%) 20 (33%) 6 (32%)
    severe 91 (44%) 41 (67%) 13 (68%)
LOT
    1 62 (47%) 55 (90%) 16 (84%)
    2 40 (31%) 5 (8.2%) 2 (11%)
    3 21 (16%) 1 (1.6%) 1 (5.3%)
    4 5 (3.8%) 0 (0%) 0 (0%)
    5 3 (2.3%) 0 (0%) 0 (0%)
    A 0 (0%) 0 (0%) 0 (0%)
    Unknown 74 0 0
TRRESP
    complete response 29 (22%) 24 (39%) 0 (0%)
    no response 16 (12%) 0 (0%) 16 (84%)
    not estimated 13 (9.9%) 0 (0%) 0 (0%)
    other 1 (0.8%) 0 (0%) 0 (0%)
    partial response 70 (53%) 37 (61%) 0 (0%)
    progression 2 (1.5%) 0 (0%) 3 (16%)
    Unknown 74 0 0
ATCN
    BCR-ABL TYROSINE KINASE INHIBITORS 7 (5.3%) 0 (0%) 0 (0%)
    BRUTON'S TYROSINE KINASE (BTK) INHIBITORS 1 (0.8%) 0 (0%) 0 (0%)
    CALCINEURIN INHIBITORS 41 (31%) 0 (0%) 0 (0%)
    CORTICOSTEROIDS FOR SYSTEMIC USE 0 (0%) 0 (0%) 0 (0%)
    CORTICOSTEROIDS, DERMATOLOGICAL PREPARATIONS 0 (0%) 0 (0%) 0 (0%)
    CORTICOSTEROIDS, POTENT (GROUP III) 0 (0%) 1 (1.6%) 0 (0%)
    GLUCOCORTICOIDS 0 (0%) 60 (98%) 19 (100%)
    IMIDAZOLE DERIVATIVES 0 (0%) 0 (0%) 0 (0%)
    INTERLEUKIN INHIBITORS 0 (0%) 0 (0%) 0 (0%)
    INTERLEUKINS 4 (3.1%) 0 (0%) 0 (0%)
    JANUS-ASSOCIATED KINASE (JAK) INHIBITORS 50 (38%) 0 (0%) 0 (0%)
    OTHER IMMUNOSTIMULANTS 0 (0%) 0 (0%) 0 (0%)
    OTHER IMMUNOSUPPRESSANTS 1 (0.8%) 0 (0%) 0 (0%)
    SELECTIVE IMMUNOSUPPRESSANTS 21 (16%) 0 (0%) 0 (0%)
    TUMOR NECROSIS FACTOR ALPHA (TNF-) INHIBITORS 6 (4.6%) 0 (0%) 0 (0%)
    Unknown 74 0 0
1 n (%)
  #add_p()

====================================================================== Рабочее, но неоформленное

Развлечения с compareGroups

# library(compareGroups)
# compareGroups(TDINTER ~ ., 
#               data = common_df_treatment_chronic,
#               method = c(triglyc = 2))

Описательные статистики рефрактерности: отберем пациентов, которые получали терапию кортикостероидами, и рассчитаем описательные статистики отдельно для группы с развившейся рефрактерностью и для группы без неё.

# common_df_treatment %>%
#   filter(GVHDTRYN == "yes",
#          ATCN %in% c("GLUCOCORTICOIDS", 
#                      "CORTICOSTEROIDS FOR SYSTEMIC USE", 
#                      "CORTICOSTEROIDS, POTENT (GROUP III)")) %>% 
#   select(SEX, TUTERM, COND, CONDTYPE, DRUGN, chronic_Joints,
#          ALIVE, GVHDAGE, TRIND, RESPEV, STERRES) %>% 
#   tbl_summary(by = STERRES) #%>% 
  #add_p()

                              Нерабочая часть

# common_df_treatment <- common_df_treatment %>% 
#   mutate(
#     GVHDTRYN = GVHDTRYN %>% as.factor(),
#     LOT = LOT %>% as.factor(),
#     ATCC = ATCC %>% as.factor(),
#     ATCN = ATCN %>% as.factor(),
#     TRRESP = TRRESP %>% as.factor(),
#     RESPEV = RESPEV %>% as.factor(),
#     TRONG = TRONG %>% as.factor(),
#     TRSTDTC = format(as.Date(TRSTDTC, format = "%d/%m/%Y"), "%d.%m.%Y"),
#     TRENDTC = format(as.Date(TRENDTC, format = "%d/%m/%Y"), "%d.%m.%Y")
#   )
# summary(common_df_treatment)

Теперь единица наблюдения – это случай получения одного препарата у одного пациента. Более агрегированная статистика может быть получена при группировке по нужной переменной (номер пациента, тип РТПХ или оба).

Добавим информацию о факте резистентности.

Не очень понятно, как оценивать, какую линию терапии не смогли оценить/какая не показала резистентности, поскольку для подобных ситуаций не указан номер линии терапии. Оставим только данные о факте резистентности.

# common_df_resist <- left_join(common_df_treatment,
#                               fnet$RS_20230119_120304 %>%
#                                 filter(REFSTYN == "yes") %>% 
#                                 rename(LOT = REFSTLOT) %>% 
#                                 select(!SITE),
#                               by = c("SUBJID", "LOT"))

# После этого сделать проверку по дате: резистентность не может наступить раньше факта лечения.
# Вообще везде сделать проверку по дате